# 1. Create the Spark environment
from pyspark.context import SparkContext

sc = SparkContext(master='local', appName='demo2_map')

# 2. Read the data: each element of the RDD is one line of the text file
students_rdd = sc.textFile("../../data/students.txt")

# Goal: compute the average age of every class.


def _class_and_age(student):
    """Turn one comma-separated student line into a (class, age) pair.

    The class is the last field of the line and the age is the third field,
    converted to an int.
    """
    fields = student.split(",")
    return fields[-1], int(fields[2])


# Pick out the class and the age of every student
kv_rdd = students_rdd.map(_class_and_age)

# groupBy: group the pairs by the value of a key function (here the class)
group_by_rdd = kv_rdd.groupBy(lambda kv: kv[0])

# ('文科六班', [('文科六班', 22), ('文科六班', 23)])
def avg_age_fun(kv):
    """Compute the average age of one class.

    Args:
        kv: A pair whose first item is the class name and whose second item
            is an iterable of records; the age is read from index 1 of each
            record, e.g. ``('c1', [('c1', 22), ('c1', 23)])``.

    Returns:
        A ``(class name, average age)`` tuple, with the average rounded to
        one decimal place.

    Raises:
        ZeroDivisionError: If the pair contains no records.
    """
    clazz, records = kv[0], kv[1]
    # Materialize the ages once so they can be both summed and counted.
    ages = [record[1] for record in records]
    total, count = sum(ages), len(ages)
    return clazz, round(total / count, 1)


# Compute the average age of each class, then print every (class, average) pair
avg_age_rdd = group_by_rdd.map(avg_age_fun)
avg_age_rdd.foreach(print)